{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Homework 7: Decision Tree\n",
    "Isac do Nascimento Lira, 371890"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "1. Utilizamos a medida de Entropia como fator de decisão (medida de impureza de um nó). Teste o mesmo conjunto \n",
    "randômico de dados para a medida Gini e compare os resultados."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import math\n",
    "import os\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1728 entries, 0 to 1727\n",
      "Data columns (total 6 columns):\n",
      "buying      1728 non-null object\n",
      "maint       1728 non-null object\n",
      "doors       1728 non-null object\n",
      "persons     1728 non-null object\n",
      "lug_boot    1728 non-null object\n",
      "safety      1728 non-null object\n",
      "dtypes: object(6)\n",
      "memory usage: 81.1+ KB\n"
     ]
    }
   ],
   "source": [
    "# car_data.csv has no header row. Each record holds the six features below plus the\n",
    "# class label, so seven names are passed to read_csv: with only six names pandas would\n",
    "# use the first field as an implicit index and reset_index(drop=True) would discard it.\n",
    "# `columns` is kept as the list of the six feature names only.\n",
    "columns = [\"buying\", \"maint\", \"doors\", \"persons\", \"lug_boot\", \"safety\"]\n",
    "cardf = pd.read_csv(\"car_data.csv\", names=columns + [\"class\"])\n",
    "\n",
    "# Shuffle the rows (seeded, so a fresh run reproduces the same order) and renumber them.\n",
    "cardf = cardf.sample(frac=1, random_state=42).reset_index(drop=True)\n",
    "cardf.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>buying</th>\n",
       "      <th>maint</th>\n",
       "      <th>doors</th>\n",
       "      <th>persons</th>\n",
       "      <th>lug_boot</th>\n",
       "      <th>safety</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>vhigh</td>\n",
       "      <td>5more</td>\n",
       "      <td>more</td>\n",
       "      <td>big</td>\n",
       "      <td>low</td>\n",
       "      <td>unacc</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>med</td>\n",
       "      <td>5more</td>\n",
       "      <td>2</td>\n",
       "      <td>small</td>\n",
       "      <td>high</td>\n",
       "      <td>unacc</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>low</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>big</td>\n",
       "      <td>low</td>\n",
       "      <td>unacc</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>low</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>med</td>\n",
       "      <td>high</td>\n",
       "      <td>unacc</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>med</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>small</td>\n",
       "      <td>med</td>\n",
       "      <td>acc</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  buying  maint doors persons lug_boot safety\n",
       "0  vhigh  5more  more     big      low  unacc\n",
       "1    med  5more     2   small     high  unacc\n",
       "2    low      3     4     big      low  unacc\n",
       "3    low      2     2     med     high  unacc\n",
       "4    med      2     4   small      med    acc"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the shuffled car data.\n",
    "cardf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Replace every categorical feature column with its integer category codes.\n",
    "\n",
    "cat_features = ['buying','maint','doors','persons','lug_boot','safety']\n",
    "cardf[cat_features] = cardf[cat_features].apply(\n",
    "    lambda column: column.astype('category').cat.codes\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Split the car data into train (~70%) and test (~30%) sets.\n",
    "# NOTE: requires `cardf` to contain the label column 'class'.\n",
    "\n",
    "# Each row is assigned to the train set with probability 0.7; the generator is seeded\n",
    "# so that the split (and the scores computed from it) is the same on every run.\n",
    "msk = np.random.RandomState(42).rand(len(cardf)) < 0.7\n",
    "\n",
    "features = cardf.drop(['class'], axis=1)\n",
    "labels = cardf['class']\n",
    "\n",
    "xtrain, ytrain = features[msk], labels[msk]\n",
    "xtest, ytest = features[~msk], labels[~msk]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prediction Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score using entropy criterion: 0.975925925926\n",
      "Score using gini criterion: 0.975925925926\n"
     ]
    }
   ],
   "source": [
    "# Fit one decision tree per impurity criterion (entropy and Gini) and compare their\n",
    "# accuracy on the held-out test split.\n",
    "\n",
    "dtree_entropy = DecisionTreeClassifier(criterion='entropy')\n",
    "dtree_entropy.fit(xtrain, ytrain)\n",
    "entropy_score = dtree_entropy.score(xtest, ytest)\n",
    "print('Score using entropy criterion:', entropy_score)\n",
    "\n",
    "dtree_gini = DecisionTreeClassifier(criterion='gini')\n",
    "dtree_gini.fit(xtrain, ytrain)\n",
    "# Score the Gini tree itself, not the unrelated `dtree` that a later cell creates.\n",
    "gini_score = dtree_gini.score(xtest, ytest)\n",
    "print('Score using gini criterion:', gini_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAEKCAYAAABUsYHRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHO1JREFUeJzt3X2UlXW99/H3R0SHp0QBORXGgLcPoQLKaOoRblAr8yGp\n9NbEkHQ1ISxL1/E2K0tuy44eW8lRS0NXogs7WilqnZOeRIcUfBqQR0MpgSN2DiKaIIoJfu8/9jWw\nHffM7JnZe357Zj6vtfaa6/n6/mZnH37X9ZvrUkRgZmbW0XZLXYCZmXVPDiAzM0vCAWRmZkk4gMzM\nLAkHkJmZJeEAMjOzJBxAZmaWhAPIzMyScACZmVkSu6cuoJINHDgwqqurU5dhZtapLFq06LWIGNTS\ndg6gZlRXV1NfX5+6DDOzTkXSumK28yU4MzNLwgFkZmZJOIDMzCwJB5CZmSXhADIzsyQcQGZmloQD\nyMzMknAAmZlZEv5D1Gas2bCRc2fOSl2GmVmHGDc297N2TG2HnM89IDMzS8IBZGZmSTiAzMwsCQeQ\nmZkl4QAyM7MkHEBmZpaEA8jMzJJwAJmZWRIOIDMzS6JTBpCkgyUtkfScpP2b2e47HVmXmZkVr1MG\nEDAR+E1EHB4Rf2lmOweQmVmFqphnwUnqA/wKGAL0AH4AHAScBvQCFgJfBz4HXAzskHRCREyQdC7w\nDWAP4GlgGnA10EvSEmAl8Bfg9YiYmZ3vauDViPjXjmulmZk1qKQe0EnAXyNiVEQcCjwE3BQRR2bz\nvYBTI+I/gFuA67Pw+SRwFvCPETEa2AFMiojLgXciYnRETAJ+AUwGkLQbcDYwp6MbaWZmOZUUQMuB\nT0u6VtLYiHgTmCDpaUnLgeOBQwrsdwIwBng26+2cAAxvvFFErAU2SToc+AzwXERsarydpFpJ9ZLq\nt219q2SNMzOzD6qYS3AR8aKkI4CTgR9KmgdMB2oi4mVJM4CqArsKuCMivl3EaW4DpgD/QK5HVKiO\nWcAsgAH7DY3WtsPMzIpTMT0gSR8D3o6IOcB1wBHZqtck9QXOaGLXecAZkvbNjrOPpKHZuvck9czb\ndi65S31HAg+Xug1mZla8iukBAYcB10l6H3gPuJDcaLcVwP8AzxbaKSKel3QF8J/ZvZ33yPWc1pHr\nySyTtDgiJkXE3yU9BvwtInaUv0lmZtaUigmgiHiYD/dK6oErCmw7o9H8PcA9Bbb7FvCthvksoI4G\nzmx/xWZm1h4Vcwmu3CSNAP4MzIuI1anrMTPr7iqmB1RuEfE8BUbHmZlZGt2mB2RmZpXFAWRmZkk4\ngMzMLAkHkJmZJeEAMjOzJBxAZmaWhAPIzMyS6DZ/B9QWwwYPYs7FtanLMDPrktwDMjOzJBxAZmaW\nhAPIzMyScACZmVkSDiAzM0vCo+CasWbDRs6dOSt1GdZO43o3PZKx1oMczZJxD8jMzJJwAJmZWRIO\nIDMzS8IBZGZmSTiAzMwsCQeQmZkl4QAyM7MkHEBmZpaEA8jMzJJwAJmZWRJlCyBJb5X4eDMkXVqC\n41RLOqcUNZmZWdt1xx5QNeAAMjNLrOwBJGm8pN/lzd8kaUo2fbKkVZIWSbohf7smjJL0pKTVkr6W\nHUOSrpO0QtJySWc1txy4BhgraYmkS0rfYjMzK0ayp2FLqgJ+DoyLiDWS/q2I3UYCRwN9gOck/Ttw\nDDAaGAUMBJ6V9Efg2CaWXw5cGhGnNlFXLVAL0HvvfdrRQjMza07KS3AHAy9FxJpsvpgAeiAi3omI\n14DHgKOA44B/i4gdEbEBmA8c2czyZkXErIioiYiaqj5929AsMzMrRkcE0PZG56lqx7GihXkzM+sk\nOiKA1gEjJO0pqT9wQrb8BWC4pOps/qwC+zZ2uqQqSQOA8cCzwOPAWZJ6SBoEjAOeaWb5FqBfSVpm\nZmZtVvZ7QBHxsqRfASuANcBz2fJ3JE
0DHpK0lVyYtGQZuUtvA4EfRMRfJc0ldx9oKbke0WUR8T/N\nLN8E7JC0FJgdEdeXtMFmZlYURaS7iiWpb0S8JUnAT4HVlRQIA/YbGp/7p++mLsPaya/kNutYkhZF\nRE1L26X+O6CvSVoCrAT2IjcqzszMuoFkw7ABst7OB3o8kr4KfLPRpgsiYnqHFWZmZmWXNIAKiYjb\ngdtT12FmZuWV+hKcmZl1Uw4gMzNLwgFkZmZJOIDMzCwJB5CZmSVRcaPgKsmwwYOYc7H/UtHMrBzc\nAzIzsyQcQGZmloQDyMzMknAAmZlZEg4gMzNLwqPgmrFmw0bOnTkrdRlmXdq4sWnOWzvGI1xTcw/I\nzMyScACZmVkSDiAzM0vCAWRmZkk4gMzMLAkHkJmZJeEAMjOzJBxAZmaWhAPIzMyScACZmVkSFR9A\nkvy4IDOzLqhDAkhStaRVku6S9CdJv5HUW9IYSfMlLZL0sKSPZtvXSZopqR74pqQzJa2QtFTSH7Nt\nqiTdLmm5pOckTciWT5F0n6SHJK2W9C/Z8h6SZmfHWS7pko5ou5mZFdaRvYuDgAsiYoGkXwDTgS8A\np0fERklnAVcD52fb7xERNQCSlgOfjYhXJPXP1k8HIiIOk3Qw8J+SDszWjQYOB94FXpB0I7Av8PGI\nODQ7ZsNxzMwsgY68BPdyRCzIpucAnwUOBf4gaQlwBTAkb/t78qYXALMlfQ3okS07LjsOEbEKWAc0\nBNC8iHgzIrYBzwNDgZeA4ZJulHQSsLlQkZJqJdVLqt+29a32tdjMzJpUVABJOlDSPEkrsvmRkq5o\n5bmi0fwWYGVEjM4+h0XEZ/LWb925Y8RUcgG1H7BI0oAWzvVu3vQOYPeIeAMYBdQBU4HbChYZMSsi\naiKipqpP32LaZWZmbVBsD+hW4NvAewARsQw4u5Xn+oSkY7Lpc4CngEENyyT1lHRIoR0l7R8RT0fE\n94GN5ILocWBStv5A4BPAC02dXNJAYLeIuJdcmB3RyvrNzKyEir0H1DsinpGUv2x7K8/1AjA9u//z\nPHAj8DBwg6S9slpmAisL7HudpAMAAfOApcAq4Obs/tB2YEpEvNuoxnwfB26X1BC6325l/WZmVkLF\nBtBrkvYnu4wm6Qzgv1t5ru0RcW6jZUuAcY03jIjxjea/WOB424CvFth3NjA7b/7UvNXu9ZiZVYhi\nA2g6MAs4WNIrwBqyy19mZmZt0WIAZZesaiLiREl9yN1H2dKak0TEWnIj3szMzIAiBiFExPvAZdn0\n1taGj5mZWSHFjoJ7RNKlkvaTtE/Dp6yVmZlZl1bsPaCzsp/T85YFMLy05ZiZWXdRVABFxLByF2Jm\nZt1LUQEkaXKh5RFxZ2nLMTOz7qLYS3BH5k1XAScAiwEHkJmZtUmxl+Auyp/PniR9d1kqMjOzbqGt\nr2PYCnT5+0LDBg9izsW1qcswM+uSir0H9Ft2Pc16N2AE8OtyFWVmZl1fsT2gH+dNbwfWRcT6MtRj\nZmbdRLF/iHpyRMzPPgsiYr2ka8tamZmZdWnFBtCnCyz7XCkLMTOz7qXZS3CSLgSmkXuV9bK8Vf3I\nvSbbzMysTRTR+E3ZeStzL4rbG/hn4PK8VVsi4vUy15bcgP2Gxuf+6bupy+hQ48bumq4d4xGAZtZ6\nkhZFRE1L2zXbA4qIN4E3gS9nB92X3B+i9pXUNyL+qxTFmplZ91PUPSBJp0laTe5FdPOBtcDvy1iX\nmZl1ccUOQvghcDTwYvZg0hOAp8pWlZmZdXnFBtB7EbEJ2E3SbhHxGNDi9T0zM7OmFPuHqH+T1Bd4\nHLhL0qvkHsdjZmbWJsX2gE4H3gYuBh4C/gKcVq6izMys6yv2adhbJQ0FDoiIOyT1BnqUtzQzM+vK\nih0F9zXgN8DPs0UfB+4vV1FmZtb1FXsJbjrwj8BmgIhYDexbrqLMzKzrKzaA3o2IvzfMSNqdXa9n\nKD
tJMyRd2lHnMzOz8is2gOZL+g7QS9Knyb0L6LflK6v0stA0M7MKUWwAXQ5sBJYDXwf+A7iiXEUB\nSPqupBclPQEclC0bLekpScskzZW0dwvL6yTNlFQPfFPSmZJWSFoq6Y/lrN/MzJrX0tOwPxER/xUR\n7wO3Zp+ykzQGOBsYTa7GxcAi4E7gooiYL+kq4EpyQ8ObWg6wR8ND8SQtBz4bEa9I6t8RbTEzs8Ja\n6gHtHOkm6d4y15JvLDA3It6OiM3Ag0AfoH9EzM+2uQMYlz2x+0PL8451T970AmB2Nqqv4DBySbWS\n6iXVb9v6VgmbZGZm+VoKIOVNDy9nIWW084kNETGV3KXD/YBFkgY03jgiZkVETUTUVPXp24Flmpl1\nLy0FUDQxXW5/BCZK6iWpH7mnLmwF3pDU8MaarwDzs1dGfGh5oYNK2j8ino6I75O7p7VfWVthZmZN\namlk2ChJm8n1hHpl02TzEREfKUdREbFY0j3AUuBV4Nls1XnALdmTGF4CvtrC8sauk3RAVv+87Phm\nZpZASy+kS/a4nYi4Gri6wKqjC2y7pInl4xvNf7FU9ZmZWfsUOwzbzMyspBxAZmaWhAPIzMyScACZ\nmVkSDiAzM0vCAWRmZkk4gMzMLAkHkJmZJeEAMjOzJPyStmYMGzyIORfXpi7DzKxLcg/IzMyScACZ\nmVkSDiAzM0vCAWRmZkk4gMzMLAkHkJmZJeFh2M1Ys2Ej586clboMs25tXG//KURtF/0VuAdkZmZJ\nOIDMzCwJB5CZmSXhADIzsyQcQGZmloQDyMzMknAAmZlZEg4gMzNLwgFkZmZJdKkAklQj6YYWtukv\naVpH1WRmZoV1qQCKiPqI+EYLm/UHHEBmZolVXABJqpa0StJsSS9KukvSiZIWSFot6ajs86Sk5yQt\nlHRQtu94Sb/LpmdI+oWkOkkvSWoIpmuA/SUtkXRdqnaamXV3lfow0v8FnAmcDzwLnAMcB3we+A4w\nGRgbEdslnQj8CPhSgeMcDEwA+gEvSLoZuBw4NCJGFzqxpFqgFqD33vuUsk1mZpanUgNoTUQsB5C0\nEpgXESFpOVAN7AXcIekAIICeTRzn3yPiXeBdSa8Cg1s6cUTMAmYBDNhvaLS7JWZmVlDFXYLLvJs3\n/X7e/PvkQvMHwGMRcShwGlBVxHF2ULmBa2bW7VRqALVkL+CVbHpKK/fdQu6SnJmZJdRZA+hfgH+W\n9Byt7NVExCZggaQVHoRgZpZOxV2Sioi1wKF581OaWHdg3m5XZOvrgLpsekaj4+Yf85wSlmxmZm3Q\nWXtAZmbWyTmAzMwsCQeQmZkl4QAyM7MkHEBmZpaEA8jMzJJwAJmZWRIOIDMzS6Li/hC1kgwbPIg5\nF9emLsPMrEtyD8jMzJJwAJmZWRIOIDMzS8IBZGZmSTiAzMwsCY+CMzNr5L333mP9+vVs27YtdSkV\nraqqiiFDhtCzZ8827e8AasaaDRs5d+as1GWYlcS4sakr+LDaMZX5Zw7r16+nX79+VFdXIyl1ORUp\nIti0aRPr169n2LBhbTqGL8GZmTWybds2BgwY4PBphiQGDBjQrl6iA8jMrACHT8va+ztyAJmZVaAN\nGzZwzjnnMHz4cMaMGcMxxxzD3Llzqa+v5xvf+EaL+x977LEdUGX7+B6QmVkLZpX4VnBtC7e+IoKJ\nEydy3nnn8ctf/hKAdevW8eCDD/KFL3yBmpqaFs+xcOHCUpRaVu4BmZlVmEcffZQ99tiDqVOn7lw2\ndOhQLrroIurq6jj11FMBmDFjBueffz7jx49n+PDh3HDDDTu379u3b4fX3VruAZmZVZiVK1dyxBFH\nFLXtqlWreOyxx9iyZQsHHXQQF154YZuHRXc094DMzCrc9OnTGTVqFEceeeSH1p1yyinsueeeDBw4\nkH333ZcNGzYkqLBtHEBmZhXmkEMOYfHixTvnf/rTnzJv3jw2btz4
oW333HPPndM9evRg+/btHVJj\nKTiAzMwqzPHHH8+2bdu4+eabdy57++23E1ZUHkkDSFK1pBUlOM5USZNLUZOZWWqSuP/++5k/fz7D\nhg3jqKOO4rzzzuPaa69NXVpJdYlBCBFxS+oazKzramnYdDl89KMf5e677y64bvz48UBuFFy+FSt2\n/Xv+rbfeKldpJVMJl+B2l3SXpD9J+o2k3pLWShoIIKlGUp2k3SStljQoW76bpD9LGiRphqRLs+V1\nkq6V9IykFyWNzZb3lvQrSc9LmivpaUktD6Y3M7OyqIQAOgj4WUR8EtgMTCu0UUS8D8wBJmWLTgSW\nRsSH78rB7hFxFHAxcGW2bBrwRkSMAL4HjCl0Hkm1kuol1W/bWvn/gjAz66wqIYBejogF2fQc4Lhm\ntv0F0HCv53zg9ia2uy/7uQiozqaPA+4GiIgVwLJCO0bErIioiYiaqj6V/4dcZmadVSUEUBSY386u\n2qp2roh4Gdgg6XjgKOD3TRzz3eznDrrIfS4zs66mEgLoE5KOyabPAZ4A1rLrEtmXGm1/G7me0q8j\nYkcrzrMA+D8AkkYAh7W1YDMza79KCKAXgOmS/gTsDdwM/D/gXyXVk+vF5HsQ6EvTl9+a8jNgkKTn\ngR8CK4E321O4mZm1XdLLUxGxFji4wKrHgQOb2G0UucEHq/KOMyNvenze9Gvsuge0DTg3IrZJ2h94\nBFjX9urNzMqnR48eHHbYrgs1Z599NpdffnmT29fV1bHHHnt0itcwNOhU90ckXQ5cyK6RcK3RG3hM\nUk9AwLSI+Hsp6zOzrmnWotK+j6GYV5H36tWLJUuWFH3Muro6+vbtWzCAtm/fzu67V97/3VfCJbii\nRcQ1ETE0Ip5ow75bstFtoyJiZEQ0NYDBzKxiVVdXc+WVV3LEEUdw2GGHsWrVKtauXcstt9zC9ddf\nz+jRo3n88ceZMmUKU6dO5VOf+hSXXXYZr7/+OhMnTmTkyJEcffTRLFuWGwg8Y8YMvvKVr3DMMcdw\nwAEHcOuttwIwefJk7r///p3nnTRpEg888EBJ29KpAsjMrLt45513GD169M7PPffcs3PdwIEDWbx4\nMRdeeCE//vGPqa6uZurUqVxyySUsWbKEsWPHArB+/XoWLlzIT37yE6688koOP/xwli1bxo9+9CMm\nT9719LJly5bx6KOP8uSTT3LVVVfx17/+lQsuuIDZs2cD8Oabb7Jw4UJOOeWUkrax8vpkZmbW7CW4\nL37xiwCMGTOG++67r+A2AGeeeSY9evQA4IknnuDee+8Fcg873bRpE5s3bwbg9NNPp1evXvTq1YsJ\nEybwzDPPMHHiRKZNm8bGjRu59957+dKXvlTyy3gOIDOzTqbhFQwtvX6hT58+RR1PUsH5yZMnM2fO\nHO6++25uv721A49b5ktwZmZdQL9+/diyZUuT68eOHctdd90F5AYsDBw4kI985CMAPPDAA2zbto1N\nmzZRV1e388V3U6ZMYebMmQCMGDGi5DW7B2RmVoEa7gE1OOmkk7jmmmua3P60007jjDPO4IEHHuDG\nG2/80PoZM2Zw/vnnM3LkSHr37s0dd9yxc93IkSOZMGECr732Gt/73vf42Mc+BsDgwYP55Cc/ycSJ\nE0vYsl0cQGZmLShm2HSp7dhR+EEva9eu3TldU1NDXV0dAAceeODOkW3AzoEIDfbZZ58PjGrLN3Lk\nSO68884PLX/77bdZvXo1X/7yl1tZfXEcQM0YNngQcy5O8CIQM7PEHnnkES644AIuueQS9tprr7Kc\nwwFkZtaNNX6pXYMTTzyRdevK+7AYD0IwM7MkHEBmZgVENH5TjDXW3t+RA8jMrJGqqio2bdrkEGpG\nRLBp0yaqqqpa3rgJvgdkZtbIkCFDWL9+PRs3bkxdSkWrqqpiyJAhbd7fAWRm1kjPnj0ZNmxY6jK6\nPF+CMzOzJBxAZmaWhAPIzMyS
kEd5NE3SFuCF1HV0kIHAa6mL6EDdqb1ua9dVqe0dGhGDWtrIgxCa\n90JE1KQuoiNIqu8ubYXu1V63tevq7O31JTgzM0vCAWRmZkk4gJo3K3UBHag7tRW6V3vd1q6rU7fX\ngxDMzCwJ94DMzCyJbhlAkk6S9IKkP0u6vMB6SbohW79M0hHF7luJ2tnetZKWS1oiqb5jK2+9Itp6\nsKQnJb0r6dLW7Ftp2tnWTvW9QlHtnZT973e5pIWSRhW7b6VpZ1s7z3cbEd3qA/QA/gIMB/YAlgIj\nGm1zMvB7QMDRwNPF7ltpn/a0N1u3FhiYuh0lbOu+wJHA1cClrdm3kj7taWtn+15b0d5jgb2z6c91\n1v9u29PWzvbddsce0FHAnyPipYj4O3A3cHqjbU4H7oycp4D+kj5a5L6Vpj3t7WxabGtEvBoRzwLv\ntXbfCtOetnZGxbR3YUS8kc0+BQwpdt8K0562dirdMYA+DrycN78+W1bMNsXsW2na016AAB6RtEhS\nbdmqLI32fD+d7bttb72d6XuF1rf3AnK9+rbsm1p72gqd6Lv1kxCsJcdFxCuS9gX+IGlVRPwxdVHW\nbl32e5U0gdz/KR+XupZya6Ktnea77Y49oFeA/fLmh2TLitmmmH0rTXvaS0Q0/HwVmEvu8kClas/3\n09m+23bV28m+VyiyvZJGArcBp0fEptbsW0Ha09bO9d2mvgnV0R9yvb6XgGHsusF3SKNtTuGDN+Wf\nKXbfSvu0s719gH550wuBk1K3qT1tzdt2Bh8chNCpvtt2trVTfa/Fthf4BPBn4Ni2/q4q4dPOtnaq\n7zZ5AYm+4JOBF8mNNPlutmwqMDWbFvDTbP1yoKa5fSv909b2khuFszT7rOwM7S2irf9A7pr6ZuBv\n2fRHOuN329a2dsbvtcj23ga8ASzJPvXN7VvJn7a2tbN9t34SgpmZJdEd7wGZmVkFcACZmVkSDiAz\nM0vCAWRmZkk4gMzMLAkHkHVLknZkTwtu+FS34Rj9JU0rfXU7j//5jn5ys6SJkkZ05Dmt+/IwbOuW\nJL0VEX3beYxq4HcRcWgr9+sRETvac+5ykLQ7ub8v+V1E/CZ1Pdb1uQdklpHUQ9J1kp7N3rXy9Wx5\nX0nzJC3O3rPS8GTia4D9sx7UdZLGS/pd3vFukjQlm14r6VpJi4EzJe0v6aHsgZGPSzq4QD1TJN2U\nTc+WdLOkpyS9lJ3rF5L+JGl23j5vSbpe0sqs5kHZ8tHZvsskzZW0d7a8TtLM7L0x3wI+D1yXtWl/\nSV/Lfh9LJd0rqXdePTdk76J5SdIZeTV8K/s9LZV0TbasxfZaN5T6L2H98SfFB9jBrr8in5stqwWu\nyKb3BOrJPQ5ld3Y9LWEguUegCKgGVuQdczy53kPD/E3AlGx6LXBZ3rp5wAHZ9KeARwvUOAW4KZue\nTe6x/CL3aP7NwGHk/hG5CBidbRfApGz6+3n7LwP+dzZ9FTAzm64DfpZ3ztnAGXnzA/KmfwhclLfd\nr7PzjyD3+gDIvZtmIdA7m9+n2Pb60/0+fhq2dVfvRMToRss+A4zM+9f8XsAB5B5h8yNJ44D3yT0a\nf3AbznkP5HpU5F4o9mtJDev2LGL/30ZESFoObIiI5dnxVpILwyVZffdk288B7pO0F9A/IuZny+8g\nFx4fqKsJh0r6IdAf6As8nLfu/oh4H3heUsPv40Tg9oh4GyAiXm9He62LcwCZ7SJy/8J/+AMLc5fR\nBgFjIuI9SWuBqgL7b+eDl7Ubb7M1+7kb8LcCAdiSd7Of7+dNN8w39d9yMTd5tzazbjYwMSKWZr+H\n8QXqgdzvriltba91cb4HZLbLw8CFknoCSDpQUh9yPaFXs/CZAAzNtt8C9Mvbfx0wQtKekvoDJxQ6\nSURsBtZIOjM7jySNKlEbdgMaenDnAE9ExJvAG5LGZsu/AswvtDMfblM/4L+z38mkIs7/B+Cref
eK\n9ilze60TcwCZ7XIb8DywWNIK4OfkehZ3ATXZpa/JwCqAyL2DZYGkFZKui4iXgV8BK7KfzzVzrknA\nBZIanlpcqldEbwWOyuo/ntz9HoDzyA0uWAaMzlve2N3A/5X0nKT9ge8BTwMLyNrdnIh4CHgQqJe0\nBLg0W1Wu9lon5mHYZl1IKYaXm3UU94DMzCwJ94DMzCwJ94DMzCwJB5CZmSXhADIzsyQcQGZmloQD\nyMzMknAAmZlZEv8f4ymL6f1hnIoAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x208f5040978>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Compare the feature importances of the two trees, one pair of bars per feature.\n",
    "# Positions are derived from `cat_features`, the same list used for the tick labels\n",
    "# (assumes the trees were fitted on exactly these columns, in this order).\n",
    "positions = np.arange(len(cat_features))\n",
    "bar_height = 0.4\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "# The two series are offset by half a bar so that neither one hides the other.\n",
    "ax.barh(positions - bar_height / 2, dtree_gini.feature_importances_, height=bar_height,\n",
    "        align='center', alpha=0.4, color='b', label='Gini')\n",
    "ax.barh(positions + bar_height / 2, dtree_entropy.feature_importances_, height=bar_height,\n",
    "        align='center', alpha=0.4, color='g', label='Entropy')\n",
    "ax.set_yticks(positions)\n",
    "ax.set_yticklabels(cat_features)\n",
    "ax.set_xlabel('Feature importance')\n",
    "ax.set_ylabel('Feature')\n",
    "ax.legend(loc='best')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. Faça o balanceamento dos dados contidos em \"train.csv\", aplique o algoritmo de Decision Tree e faça a submissão no Kaggle. Tente melhorar o resultado obtido em sala de aula (posição 3100 no leaderboard)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the training set and the test set from their CSV files.\n",
    "train_csv, test_csv = 'train/train.csv', 'test/test.csv'\n",
    "portodf = pd.read_csv(train_csv)\n",
    "testsdf = pd.read_csv(test_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>target</th>\n",
       "      <th>ps_ind_01</th>\n",
       "      <th>ps_ind_02_cat</th>\n",
       "      <th>ps_ind_03</th>\n",
       "      <th>ps_ind_04_cat</th>\n",
       "      <th>ps_ind_05_cat</th>\n",
       "      <th>ps_ind_06_bin</th>\n",
       "      <th>ps_ind_07_bin</th>\n",
       "      <th>ps_ind_08_bin</th>\n",
       "      <th>...</th>\n",
       "      <th>ps_calc_11</th>\n",
       "      <th>ps_calc_12</th>\n",
       "      <th>ps_calc_13</th>\n",
       "      <th>ps_calc_14</th>\n",
       "      <th>ps_calc_15_bin</th>\n",
       "      <th>ps_calc_16_bin</th>\n",
       "      <th>ps_calc_17_bin</th>\n",
       "      <th>ps_calc_18_bin</th>\n",
       "      <th>ps_calc_19_bin</th>\n",
       "      <th>ps_calc_20_bin</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 59 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  target  ps_ind_01  ps_ind_02_cat  ps_ind_03  ps_ind_04_cat  \\\n",
       "0   7       0          2              2          5              1   \n",
       "1   9       0          1              1          7              0   \n",
       "2  13       0          5              4          9              1   \n",
       "3  16       0          0              1          2              0   \n",
       "4  17       0          0              2          0              1   \n",
       "\n",
       "   ps_ind_05_cat  ps_ind_06_bin  ps_ind_07_bin  ps_ind_08_bin       ...        \\\n",
       "0              0              0              1              0       ...         \n",
       "1              0              0              0              1       ...         \n",
       "2              0              0              0              1       ...         \n",
       "3              0              1              0              0       ...         \n",
       "4              0              1              0              0       ...         \n",
       "\n",
       "   ps_calc_11  ps_calc_12  ps_calc_13  ps_calc_14  ps_calc_15_bin  \\\n",
       "0           9           1           5           8               0   \n",
       "1           3           1           1           9               0   \n",
       "2           4           2           7           7               0   \n",
       "3           2           2           4           9               0   \n",
       "4           3           1           1           3               0   \n",
       "\n",
       "   ps_calc_16_bin  ps_calc_17_bin  ps_calc_18_bin  ps_calc_19_bin  \\\n",
       "0               1               1               0               0   \n",
       "1               1               1               0               1   \n",
       "2               1               1               0               1   \n",
       "3               0               0               0               0   \n",
       "4               0               0               1               1   \n",
       "\n",
       "   ps_calc_20_bin  \n",
       "0               1  \n",
       "1               0  \n",
       "2               0  \n",
       "3               0  \n",
       "4               0  \n",
       "\n",
       "[5 rows x 59 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the training data.\n",
    "portodf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    573518\n",
       "1     21694\n",
       "Name: target, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the training rows in each target class to check the class balance.\n",
    "portodf['target'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dados desbalanceados."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Balance the classes by undersampling the majority class (target == 0) and keeping\n",
    "# every row of the minority class (target == 1).\n",
    "samples = 25000\n",
    "\n",
    "majority_idx = portodf.index[portodf.target == 0]\n",
    "minority_idx = portodf.index[portodf.target == 1]\n",
    "\n",
    "# replace=False yields `samples` distinct rows (capped at the number available);\n",
    "# the seeded generator makes the selection reproducible.\n",
    "rng = np.random.RandomState(42)\n",
    "sampled_majority_idx = rng.choice(\n",
    "    majority_idx, size=min(samples, len(majority_idx)), replace=False\n",
    ")\n",
    "\n",
    "# Labels are passed to .loc as a sorted array (not a set), which also keeps the\n",
    "# original row order.\n",
    "selected_idx = np.sort(np.concatenate([sampled_majority_idx, minority_idx]))\n",
    "sampled_portodf = portodf.loc[selected_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train/test split helper\n",
    "def split_train(data, train_frac=0.7, random_state=None):\n",
    "    \"\"\"Randomly split `data` into train and test features and labels.\n",
    "\n",
    "    Each row is assigned to the training set independently with probability\n",
    "    `train_frac`, so the split sizes are approximate rather than exact.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : pandas.DataFrame\n",
    "        Frame holding the feature columns and a 'target' label column.\n",
    "    train_frac : float, default 0.7\n",
    "        Probability that a row is placed in the training set.\n",
    "    random_state : int or None, default None\n",
    "        Seed for a dedicated generator. With None the draw comes from\n",
    "        NumPy's global random state.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    xtrain, ytrain, xtest, ytest\n",
    "        Feature frames (without 'target') and the matching label series.\n",
    "    \"\"\"\n",
    "    if random_state is None:\n",
    "        draws = np.random.rand(len(data))\n",
    "    else:\n",
    "        draws = np.random.RandomState(random_state).rand(len(data))\n",
    "    msk = draws < train_frac\n",
    "\n",
    "    features = data.drop(['target'], axis=1)\n",
    "    # Labels are read from `data` itself, never from a notebook-level frame, and are\n",
    "    # selected positionally with the same mask so they always line up with the features.\n",
    "    labels = data['target']\n",
    "\n",
    "    xtrain, ytrain = features.iloc[msk], labels.iloc[msk]\n",
    "    xtest, ytest = features.iloc[~msk], labels.iloc[~msk]\n",
    "\n",
    "    return xtrain,ytrain,xtest,ytest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.96      0.95      0.96    172026\n",
      "          1       0.04      0.06      0.05      6680\n",
      "\n",
      "avg / total       0.93      0.92      0.92    178706\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Fit a decision tree on a train split and report per-class metrics on the held-out split.\n",
    "# NOTE(review): the recorded output of this cell lists about 178k test rows, far more than\n",
    "# `sampled_portodf` can hold, so it was presumably produced from the full `portodf` as an\n",
    "# unbalanced baseline - confirm which frame this cell is meant to use.\n",
    "\n",
    "xtrain, ytrain, xtest, ytest = split_train(sampled_portodf)\n",
    "dtree = DecisionTreeClassifier()\n",
    "dtree.fit(xtrain, ytrain)\n",
    "predictions = dtree.predict(xtest)\n",
    "print(classification_report(ytest, predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "        0.0       0.56      0.55      0.55      7303\n",
      "        1.0       0.50      0.51      0.50      6483\n",
      "\n",
      "avg / total       0.53      0.53      0.53     13786\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Draw a fresh split of the balanced sample, refit the tree and report its metrics.\n",
    "xtrain, ytrain, xtest, ytest = split_train(sampled_portodf)\n",
    "dtree = DecisionTreeClassifier().fit(xtrain, ytrain)\n",
    "predictions = dtree.predict(xtest)\n",
    "print(classification_report(ytest, predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Apenas amostrando o dataset para torná-lo mais balanceado, nota-se uma melhora em ambas as métricas (precision e recall)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict the test set and assemble the submission frame: the first column of\n",
    "# `testsdf` as 'id' next to the predicted 'target'.\n",
    "predictions = dtree.predict(testsdf)\n",
    "submission = pd.DataFrame({'id': testsdf.iloc[:, 0], 'target': predictions})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Write the submission to CSV without the DataFrame index column.\n",
    "submission.to_csv('kaggle_submission.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    489477\n",
       "1.0    403339\n",
       "Name: target, dtype: int64"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count how many test rows were predicted in each class.\n",
    "submission['target'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
